Author

Nathan Herling

Published

June 20, 2025

Code
#--------------------->
#################
# Package Setup #
#################
#Check if pacman [package manager] is installed, if not install it.
#throw [FYI] alert either way.
if (!requireNamespace("pacman", quietly = TRUE)) {
  message("Installing 'pacman' (not found locally)...")
  install.packages("pacman")
} else {
  message("[FYI]\n'pacman' already installed — skipping install.")
}
[FYI]
'pacman' already installed — skipping install.
Code
# use this line for installing/loading
# pacman::p_load()
# - packages to load stored in a variable (vector)
pkgs <- c(
  "tidyverse",
  "glue",
  "scales",
  "lubridate",
  "patchwork",
  "ggh4x",
  "ggrepel",
  "openintro",
  "ggridges",
  "dsbox",
  "janitor",
  "here",
  "knitr",
  "ggthemes",
  "ggplot2",
  "kableExtra",
  "palmerpenguins",
  "grid",
  "htmltools",
  "plotly",
  "ggforce",
  "cowplot",
  "magick",
  "forcats",
  "stringr",
  "viridis"
)
# - load from the character array/vector
pacman::p_load(char=pkgs)

# - install tidyverse/dsbox directly from Git Hub
# - this allows for the possible need to install on a repo. pull.
# - and, if it's already installed just thorw an alert.
if (!requireNamespace("dsbox", quietly = TRUE)) {
  message("Installing 'dsbox' from GitHub (not found locally)...")
  suppressMessages(devtools::install_github("tidyverse/dsbox"))
} else {
  message("[FYI]\n'dsbox' already installed — skipping GitHub install.")
}
[FYI]
'dsbox' already installed — skipping GitHub install.
Code
# - alert to user packages loaded.
# Set number of columns (adjustable)
n_cols <- 4

# Add * to each package name
pkgs <- paste0("* ", pkgs)

# Calculate number of rows based on total packages
n_rows <- ceiling(length(pkgs) / n_cols)

# Pad with empty strings to complete grid
pkgs_padded <- c(pkgs, rep("", n_rows * n_cols - length(pkgs)))

# Create matrix (fill by row)
pkg_matrix <- matrix(pkgs_padded, nrow = n_rows, byrow = TRUE)

# Print header
cat("The packages loaded:")
The packages loaded:
Code
# Loop and print each row (use invisible to suppress NULL)
invisible(apply(pkg_matrix, 1, function(row) {
  cat(paste(format(row, width = 22), collapse = ""), "\n")
}))
* tidyverse           * glue                * scales              * lubridate            
* patchwork           * ggh4x               * ggrepel             * openintro            
* ggridges            * dsbox               * janitor             * here                 
* knitr               * ggthemes            * ggplot2             * kableExtra           
* palmerpenguins      * grid                * htmltools           * plotly               
* ggforce             * cowplot             * magick              * forcats              
* stringr             * viridis                                                          
Code
#-------------------------->
######################
# Basic set Theme up #
######################
# ---- set theme for ggplot2
ggplot2::theme_set(ggplot2::theme_minimal(base_size = 14))

# set width of code output
options(width = 65)

# set figure parameters for knitr
knitr::opts_chunk$set(
  fig.width = 7,        # 7" width
  fig.asp = 0.618,      # the golden ratio
  fig.retina = 3,       # dpi multiplier for displaying HTML output on retina
  fig.align = "center", # center align figures
  dpi = 300             # higher dpi, sharper image
)
## ---- end theme set up

(>>>>) - function block

In an effort to reduce repeating code a function block was created.

Code
# ............ A function block, to handle Q3,Q4 with minimal code duplication

# - size as a variable
set_dot_size <- 1
# Function for the "All" group plot (g0)
plot_all <- function(data) {
  ggplot(data, aes(x = explanatory_value, y = mean)) +
    geom_errorbar(aes(ymin = low, ymax = high), width = 0.2) +
    geom_point(size = set_dot_size, color = "black") +
    coord_flip() +
    facet_grid(
      rows = vars(explanatory),
      cols = vars(response),
      labeller = labeller(
        response = as_labeller(response_labels),
        explanatory = as_labeller(explanatory_labels)
      )
    ) +
    theme_minimal(base_size = 11) +
    labs(
      title = "COVID-19 Vaccine Attitudes by Demographic Group",
      x = NULL,
      y = NULL
    ) +
    theme(
      plot.title = element_text(hjust = 0.5),
      strip.background = element_rect(fill = strip_fill_color, color = "black"),
      strip.placement = strip_placement,
      strip.text.x = element_text(
        vjust = 0.5,
        size = strip_text_size,
        margin = margin(t = 20, b = 10, r = 5, l = 5)
      ),
      strip.text.y.right = element_text(
        angle = 0,
        vjust = 0.5,
        margin = margin(t = 10, b = 10, r = 15, l = 15)
      ),
      axis.text.y = element_blank(),
      axis.text.x = element_blank(),
      axis.ticks.x = element_blank()
    )
}

# Function for the Age plot (g1)
plot_age <- function(data) {
  ggplot(data, aes(x = explanatory_value, y = mean, group = explanatory_value)) +
    geom_errorbar(aes(ymin = low, ymax = high), width = 0.2) +
    geom_point(size = set_dot_size, color = "black") +
    coord_flip() +
    facet_grid(
      rows = vars(explanatory),
      cols = vars(response),
      labeller = labeller(
        explanatory = as_labeller(explanatory_labels)
      )
    ) +
    theme_minimal(base_size = 12) +
    labs(
      x = NULL,
      y = NULL
    ) +
    theme(
      strip.background = element_rect(fill = strip_fill_color, color = "black"),
      strip.placement = strip_placement,
      strip.text.x = element_blank(),
      strip.text.y.right = element_text(
        angle = 0,
        vjust = 0.5,
        margin = margin(t = 10, b = 10, r = 13, l = 13)
      ),
      axis.text.y = element_text(size = 10),
      panel.spacing = unit(1, "lines"),
      axis.text.x = element_blank(),
      axis.ticks.x = element_blank()
    )
}

# Function for the Gender plot (g2)
plot_gender <- function(data) {
  ggplot(data, aes(x = explanatory_value, y = mean, group = explanatory_value)) +
    geom_errorbar(aes(ymin = low, ymax = high), width = 0.2) +
    geom_point(size = set_dot_size, color = "black") +
    coord_flip() +
    facet_grid(
      rows = vars(explanatory),
      cols = vars(response),
      labeller = labeller(
        explanatory = as_labeller(explanatory_labels)
      )
    ) +
    theme_minimal(base_size = 12) +
    labs(
      x = NULL,
      y = NULL
    ) +
    theme(
      strip.background = element_rect(fill = strip_fill_color, color = "black"),
      strip.placement = strip_placement,
      strip.text.x = element_blank(),
      strip.text.y.right = element_text(
        angle = 0,
        vjust = 0.5,
        margin = margin(t = 10, b = 10, r = 5, l = 6)
      ),
      axis.text.y = element_text(size = 10),
      axis.text.x = element_blank(),
      panel.spacing = unit(1, "lines"),
      axis.ticks.x = element_blank()
    )
}

# Function for the Race plot (g3)
plot_race <- function(data) {
  ggplot(data, aes(x = explanatory_value, y = mean, group = explanatory_value)) +
    geom_errorbar(aes(ymin = low, ymax = high), width = 0.2) +
    geom_point(size = set_dot_size, color = "black") +
    coord_flip() +
    facet_grid(
      rows = vars(explanatory),
      cols = vars(response),
      labeller = labeller(
        explanatory = as_labeller(explanatory_labels)
      )
    ) +
    theme_minimal(base_size = 12) +
    labs(
      x = NULL,
      y = NULL
    ) +
    theme(
      strip.background = element_rect(fill = strip_fill_color, color = "black"),
      strip.placement = strip_placement,
      strip.text.x = element_blank(),
      strip.text.y.right = element_text(
        angle = 0,
        vjust = 0.5,
        margin = margin(t = 10, b = 10, r = 10, l = 10)
      ),
      axis.text.y = element_text(size = 10),
      panel.spacing = unit(1, "lines"),
      axis.text.x = element_blank(),
      axis.ticks.x = element_blank()
    )
}

# Function for the Ethnicity plot (g4)
plot_ethnicity <- function(data, sub_title_specific) {
  ggplot(data, aes(x = explanatory_value, y = mean, group = explanatory_value)) +
    geom_errorbar(aes(ymin = low, ymax = high), width = 0.2) +
    geom_point(size = set_dot_size, color = "black") +
    coord_flip() +
    facet_grid(
      rows = vars(explanatory),
      cols = vars(response),
      labeller = labeller(
        explanatory = as_labeller(explanatory_labels)
      )
    ) +
    theme_minimal(base_size = 10) +
    labs(
      x = NULL,
      y = paste0("Mean Likert score\n(Error bars: ", sub_title_specific, ")")
    ) +
    theme(
      strip.background = element_rect(fill = strip_fill_color, color = "black"),
      strip.placement = strip_placement,
      strip.text.x = element_blank(),
      strip.text.y.right = element_text(
        angle = 0,
        vjust = 0.5,
        margin = margin(t = 10, b = 10, r = 4, l = 7)
      ),
      axis.text.y = element_text(size = 10),
      axis.text.x = element_text(size = 10),
      axis.ticks.x = element_line(),
      panel.spacing = unit(1, "lines")
    )
}
# ..... prepare the variables.
# . ethnicity.
filter_ethnicity_data <- function(data) {
  data %>%
    filter(explanatory == "exp_ethnicity") %>%
    filter(is.finite(mean), is.finite(low), is.finite(high)) %>%
    mutate(
      explanatory_value = recode(as.character(explanatory_value),
                                 "1" = "Hispanic/Latino",
                                 "2" = "Non-Hispanic/Non-Latino"),
      explanatory_value = factor(explanatory_value, levels = c(
        "Hispanic/Latino", "Non-Hispanic/Non-Latino"
      )),
      explanatory = factor(explanatory, levels = c(
        "All", "exp_age_bin", "exp_gender", "exp_race", "exp_ethnicity"
      ))
    )
}

# . age
filter_age_data <- function(data) {
  data %>%
    filter(explanatory == "exp_age_bin") %>%
    filter(is.finite(mean), is.finite(low), is.finite(high)) %>%
    mutate(
      explanatory_value = recode(as.character(explanatory_value),
                                 "0" = "<20",
                                 "20" = "21-25",
                                 "25" = "26-30",
                                 "30" = ">30"
      ),
      explanatory_value = factor(explanatory_value, levels = c("<20", "21-25", "26-30", ">30")),
      explanatory = factor(explanatory, levels = c("All", "exp_age_bin", "exp_gender", "exp_race"))
    )
}


# . gender
filter_gender_data <- function(data) {
  data %>%
    filter(explanatory == "exp_gender") %>%
    filter(is.finite(mean), is.finite(low), is.finite(high)) %>%
    mutate(
      explanatory_value = as.character(explanatory_value),
      explanatory_value = fct_recode(factor(explanatory_value),
        "Prefer not to say" = "4",
        "Non-binary third gender" = "3",
        "Male" = "0",
        "Female" = "1"
      ),
      explanatory_value = factor(explanatory_value, levels = rev(c(
        "Prefer not to say",
        "Non-binary third gender",
        "Male",
        "Female"
      ))),
      explanatory = factor(explanatory, levels = c("All", "exp_age_bin", "exp_gender", "exp_race"))
    )
}


# . race
filter_race_data <- function(data) {
  data %>%
    filter(explanatory == "exp_race") %>%
    filter(is.finite(mean), is.finite(low), is.finite(high)) %>%
    mutate(
      explanatory_value = recode(as.character(explanatory_value),
        "1" = "American Indian/Alaska Native",
        "2" = "Asian",
        "3" = "Black/African American",
        "4" = "Native Hawaiian/Other Pacific Islander",
        "5" = "White"
      ),
      explanatory_value = factor(explanatory_value, levels = rev(c(
        "White",
        "Native Hawaiian/Other Pacific Islander",
        "Black/African American",
        "Asian",
        "American Indian/Alaska Native"
      ))),
      explanatory = factor(explanatory, levels = c("All", "exp_age_bin", "exp_gender", "exp_race"))
    )
}

1 - Du Bois challenge.

Du Bois challenge. Recreate the following visualization by W.E.B. Du Bois on family budgets split by income classes for 150 families in Atlanta, Georgia. This visualization was originally created using ink and watercolors.

Note: Since there appears to be some allowable creativity with the features reperesented. I left a scale on the bottom of the parchment, and left off the ‘connecting lines’ connecting the same colored segments together for the stacked bar charts. It ended up being a lot of code - to separately construct and place all pieces of the chart together. First effort. There may be a more efficient way to re-make the plot? . I rendered the output image as html - and I cannot git rid of the small ‘png 2’ label (atm).

png 2

A recreation of ‘Income and Expenditure of 150 Negro Families in Atlanta, GA, USA’
Some re-interpretations were taken:
- connecting lines between stacked bar chart areas were left off
- slightly modified ‘}’ grouping labels were used on right hand side of bar charts.
- an axis was rendered on the bottom to help visualize scale

2 - COVID survey - interpretation

Q2 - Interpret what’s occurring in the survey, and discuss any results that go against your intuition.
In a chart this large, “interpret” (as opposed to simply describing) really means identifying trends in the data.
Overall description The COVID vaccine survey gathered responses from medical and nursing students across the U.S. regarding their attitudes toward vaccine safety, trust, and recommendations. The visualization arranges responses in a grid, with response variables in columns and explanatory variables (like age, profession, or gender) in rows. Each pane displays the mean Likert score and error bars between the 10th and 90th percentiles for each subgroup, offering insight into both central tendency and variability. The top row summarizes overall distributions, unconditioned by explanatory factors.
-
Interesting Trends in the Data:
1. Trust and Profession:
Medical students displayed more variability in their agreement with the statement “I trust the information that I have received about the vaccines” compared to nursing students. While both groups leaned toward agreement, the broader spread among medical students suggests more diverse opinions, possibly reflecting deeper exposure to varying sources of information or a more analytical approach to evaluating it.
2. Concern About Side Effects and Age:
Across all age groups, responses to concerns about “safety and side effects” hovered around a neutral average (Likert score ≈ 3), with relatively wide error bars. This indicates uncertainty or ambivalence. However, younger students tended to show slightly higher trust (i.e. lower concern scores), suggesting that age may play a role in perceived vaccine risk.
3. There is perhaps some counter-intuition at play here among nursing students who responded to the question: “Based on my understanding, I believe this vaccine is safe.”
The 10–90th percentile bars span the entire Likert scale, suggesting considerable variability in responses. While this reflects some uncertainty, it may stem from the ambiguity of what understanding means in this context. Perhaps students interpret understanding as requiring a solid grasp of virology—something not all nursing students may have studied in depth. Alternatively, they may associate it with the rapid development timeline of the vaccine, leading to concerns about whether it was produced safely. It seems unlikely, however, that true denial of vaccine safety is the dominant interpretation among nursing students.

Overall, the data reveal meaningful variation in how medical and nursing students interpret the science and safety of COVID-19 vaccines, highlighting the complexity of attitudes even within healthcare education.

Code
#------- no code necessary ..

3 - COVID survey - reconstruct

Q3 ….

Data Analysis - Q1
📄 The original data frame (raw_preview) has:
- 1123 rows
- 14 columns
Table 1. Dataset Missing Value Diagnostics
Metric Value
Total % of values missing 8.65
Percent of rows with ≥1 NA 17.4
Row indices with ≥1 NA (first 20) 3, 37, 41, 53, 61, 67, 76, 88, 98, 103, 114, 115, 116, 118, 150, 151, 152, 153, 154, 155
Percent of rows with >1 NA 12.13
Row indices with >1 NA (first 20) 3, 53, 114, 115, 116, 118, 151, 152, 153, 154, 155, 208, 216, 299, 343, 345, 346, 347, 374, 403
✅ Rows with only `response_id` and all other fields missing have been removed.
Original dataset rows: 1121
Rows removed: 10
Cleaned dataset size: 1111 rows × 14 columns


**Rows_Removed**
row:3 row:152 row:153 row:414
row:529 row:556 row:577 row:835
row:987 row:1050
Code
# - Step 1a: print the dim of the original df.
original_dim <- dim(raw_preview)

cat(glue(
  "📄 The original data frame (`raw_preview`) has:\n",
  "- {original_dim[1]} rows\n",
  "- {original_dim[2]} columns\n\n",
  "⚠️  Rows with no available data (i.e., only `response_id` present)\n will be removed in preprocessing.\n",
  "\n✅ **New Dimensions** of `survey_clean` after cleaning:\n",
  "📊 Rows: {nrow(survey_clean)}\n",
  "📐 Columns: {ncol(survey_clean)}\n"
))
📄 The original data frame (`raw_preview`) has:
- 1123 rows
- 14 columns

⚠️  Rows with no available data (i.e., only `response_id` present)
 will be removed in preprocessing.

✅ **New Dimensions** of `survey_clean` after cleaning:
📊 Rows: 1111
📐 Columns: 14
Code
#-- ... --- based on info in pdf file and .csv .. encode the following
# exp_profession........... 
# exp_flu_vax.............. 
# exp_gender............... Q2 What is your gender? 
# exp_race................. Q3 What is your race?   
# exp_ethnicity............ Q4 What is your ethnicity?  
# exp_age_bin.............. Q1 What is your age?    
# exp_already_vax.......... 
# resp_safety.............. Q26 Based on my understanding, I believe the vaccine is safe.   
# resp_confidence_science.. Q34 I am confident in the scientific vetting process for the new COVID vaccines.    
# resp_concern_safety...... Q27 I am concerned about the safety and side effects of the vaccine.    
# resp_feel_safe_at_work... Q28 Getting the vaccine will make me feel safer at work.    
# resp_will_recommend...... Q29 I will recommend the vaccine to family, friends, and community members. 
# resp_trust_info.......... Q31 I trust the information that I have received about the COVID-19 vaccines.

covid_survey_longer <- survey_clean |>
  pivot_longer(
    cols = starts_with("exp_"),
    names_to = "explanatory",
    values_to = "explanatory_value"
  ) |>
  mutate(explanatory_value = as.factor(explanatory_value)) |>
  filter(!is.na(explanatory_value)) |>
  pivot_longer(
    cols = starts_with("resp_"),
    names_to = "response",
    values_to = "response_value"
  )

print(covid_survey_longer)
# A tibble: 43,428 × 5
   response_id explanatory    explanatory_value response         
         <dbl> <chr>          <fct>             <chr>            
 1           1 exp_profession 1                 resp_safety      
 2           1 exp_profession 1                 resp_confidence_…
 3           1 exp_profession 1                 resp_concern_saf…
 4           1 exp_profession 1                 resp_feel_safe_a…
 5           1 exp_profession 1                 resp_will_recomm…
 6           1 exp_profession 1                 resp_trust_info  
 7           1 exp_flu_vax    1                 resp_safety      
 8           1 exp_flu_vax    1                 resp_confidence_…
 9           1 exp_flu_vax    1                 resp_concern_saf…
10           1 exp_flu_vax    1                 resp_feel_safe_a…
# ℹ 43,418 more rows
# ℹ 1 more variable: response_value <dbl>
Q3 code explanation:

covid_survey_longer <- covid_survey |>
pivot_longer(
cols = starts_with(“exp_”),
names_to = “explanatory”,
values_to = “explanatory_value”
) |>
filter(!is.na(explanatory_value)) |>
pivot_longer(
cols = starts_with(“resp_”),
names_to = “response”,
values_to = “response_value”
)


first pivot_longer():
Converts all columns that start with “exp_” (e.g., exp_profession, exp_gender, etc.) from wide format into long format.
Creates two new columns:
explanatory: holds the original column names (like “exp_profession”)
explanatory_value: holds the actual values from those columns (like “Nursing” or “1”)
second pivot_longer():
After already pivoting the explanatory variables, this takes the remaining
response variables (resp_safety, resp_confidence_science, etc.) and pivots them long as well.
Creates two new columns:
response: original column name
response_value: corresponding value

create the df/tibble: covid_survey_summary_stats_by_group

Code
# - group the data - by explanatory, explanatory_value, and response calc.
# - the following stats:
# - mean of the response_value
# - low 10th percentile of the response_value
# - high 90th percentile of the response_value
# - rename the df coivd_survey_summart_stats_by_group
covid_survey_summary_stats_by_group <- covid_survey_longer |>
  group_by(explanatory, explanatory_value, response) |>
  summarise(
    mean = mean(response_value, na.rm = TRUE),
    low = quantile(response_value, probs = 0.10, na.rm = TRUE),
    high = quantile(response_value, probs = 0.90, na.rm = TRUE),
    .groups = "drop"
  )

print(covid_survey_summary_stats_by_group)
# A tibble: 126 × 6
   explanatory explanatory_value response        mean   low  high
   <chr>       <fct>             <chr>          <dbl> <dbl> <dbl>
 1 exp_age_bin 0                 resp_concern_…  3.35     2   4.4
 2 exp_age_bin 0                 resp_confiden…  1.65     1   2.4
 3 exp_age_bin 0                 resp_feel_saf…  1.71     1   3.8
 4 exp_age_bin 0                 resp_safety     1.41     1   2  
 5 exp_age_bin 0                 resp_trust_in…  1.41     1   2  
 6 exp_age_bin 0                 resp_will_rec…  1.35     1   1.8
 7 exp_age_bin 20                resp_concern_…  3.32     2   5  
 8 exp_age_bin 20                resp_confiden…  1.31     1   2  
 9 exp_age_bin 20                resp_feel_saf…  1.20     1   2  
10 exp_age_bin 20                resp_safety     1.95     1   5  
# ℹ 116 more rows
Code
#View(covid_survey_summary_stats_by_group)

create the df/tibble: covid_survey_summary_stats_all

Code
library(dplyr)

covid_survey_summary_stats_all <- covid_survey_longer |>
  group_by(response) |>
  summarise(
    mean = mean(response_value, na.rm = TRUE),
    low = quantile(response_value, probs = 0.10, na.rm = TRUE),
    high = quantile(response_value, probs = 0.90, na.rm = TRUE),
    explanatory = "All",
    explanatory_value = factor(""),
    .groups = "drop"
  )

print(covid_survey_summary_stats_all)
# A tibble: 6 × 6
  response         mean   low  high explanatory explanatory_value
  <chr>           <dbl> <dbl> <dbl> <chr>       <fct>            
1 resp_concern_s…  3.28     1     5 All         ""               
2 resp_confidenc…  1.43     1     2 All         ""               
3 resp_feel_safe…  1.36     1     2 All         ""               
4 resp_safety      2.03     1     5 All         ""               
5 resp_trust_info  1.40     1     2 All         ""               
6 resp_will_reco…  1.21     1     2 All         ""               
Code
#View(covid_survey_summary_stats_all)

Bind the two df’s
create the df/tibble: covid_summary_of_stats

Code
# Get existing levels from grouped data
age_levels <- levels(covid_survey_summary_stats_by_group$explanatory_value)

# Add a new level to represent the 'All' group
age_levels_with_all <- c(age_levels, "")

# Create the all-summary with the new factor level
covid_survey_summary_stats_all <- covid_survey_longer |>
  group_by(response) |>
  summarise(
    mean = mean(response_value, na.rm = TRUE),
    low = quantile(response_value, probs = 0.10, na.rm = TRUE),
    high = quantile(response_value, probs = 0.90, na.rm = TRUE),
    explanatory = "All",
    explanatory_value = factor("", levels = age_levels_with_all),
    .groups = "drop"
  )

# Ensure grouped summary has the same levels too
covid_survey_summary_stats_by_group$explanatory_value <- factor(
  covid_survey_summary_stats_by_group$explanatory_value,
  levels = age_levels_with_all
)

# Bind them safely now
covid_survey_summary_stats <- bind_rows(
  covid_survey_summary_stats_all,
  covid_survey_summary_stats_by_group
)

print(covid_survey_summary_stats)
# A tibble: 132 × 6
   response        mean   low  high explanatory explanatory_value
   <chr>          <dbl> <dbl> <dbl> <chr>       <fct>            
 1 resp_concern_…  3.28     1   5   All         ""               
 2 resp_confiden…  1.43     1   2   All         ""               
 3 resp_feel_saf…  1.36     1   2   All         ""               
 4 resp_safety     2.03     1   5   All         ""               
 5 resp_trust_in…  1.40     1   2   All         ""               
 6 resp_will_rec…  1.21     1   2   All         ""               
 7 resp_concern_…  3.35     2   4.4 exp_age_bin "0"              
 8 resp_confiden…  1.65     1   2.4 exp_age_bin "0"              
 9 resp_feel_saf…  1.71     1   3.8 exp_age_bin "0"              
10 resp_safety     1.41     1   2   exp_age_bin "0"              
# ℹ 122 more rows

Q3e - recreate plot

Code
# Labels for rows (explanatory variables), including Gender and Race
explanatory_labels <- c(
  All = "All",
  exp_age_bin = "Age",
  exp_gender = "Gender",
  exp_race = "Race"    ,# Added Race label
  exp_ethnicity = "Ethnicity"
)

# - call formatting for encoded data
covid_age_only <- filter_age_data(covid_survey_summary_stats_by_group)

covid_gender_only <- filter_gender_data(covid_survey_summary_stats_by_group)

covid_race_only <- filter_race_data(covid_survey_summary_stats_by_group)

covid_ethnicity_only <- filter_ethnicity_data(covid_survey_summary_stats_by_group)

# Label mappings for response
response_labels <- c(
  resp_safety = "Vaccine is safe",
  resp_feel_safe_at_work = "Feel safer\n at work",
  resp_concern_safety = "Concern about \nvaccine safety",
  resp_confidence_science = "Confidence in \nscientific vetting",
  resp_trust_info = "Trust in \nvaccine info",
  resp_will_recommend = "Will recommend\nvaccine"
)

# Reorder response factor levels to match response_labels
covid_age_only <- covid_age_only %>%
  mutate(response = factor(response, levels = names(response_labels)))

covid_gender_only <- covid_gender_only %>%
  mutate(response = factor(response, levels = names(response_labels)))

# View distinct codes used in the exp_ethnicity variable
covid_survey_summary_stats_by_group %>%
  filter(explanatory == "exp_ethnicity") %>%
  mutate(explanatory_value = as.character(explanatory_value)) %>%
  distinct(explanatory_value) %>%
  arrange(explanatory_value)
# A tibble: 2 × 1
  explanatory_value
  <chr>            
1 1                
2 2                
Code
# Vector controlling heights of each row - add height for race
row_heights <- c(
  0.5,  # - 'All' row height — adjust as needed
  3,    # - 'exp_age_bin' row height
  3,    # - 'exp_gender' row height - adjust as desired
  3,    # - 'exp_race' row height - new Race row
  3     # - ethnicity
)

# Reorder response factor levels for 'All' layer
covid_all_only <- covid_survey_summary_stats_all %>%
  filter(is.finite(mean), is.finite(low), is.finite(high)) %>%
  mutate(response = factor(response, levels = names(response_labels)))

# - vars for standardizing box size row/col
# Define variables for strip appearance
strip_fill_color <- "gray90"
strip_text_color <- "black"
strip_text_size <- 10
strip_text_face <- "plain"
strip_text_angle_x <- 0
strip_text_angle_y <- 0
strip_text_vjust_y <- 0.5
strip_placement <- "outside"  # already used in your code


# Call some functions
g0 <- plot_all(covid_all_only)
# - second layer - Age
g1 <- plot_age(covid_age_only)
# - third layer - gender
g2 <- plot_gender(covid_gender_only)
# Fourth layer - Race
g3 <- plot_race(covid_race_only)
# Fifth layer: Ethnicity (if present)
g4 <- plot_ethnicity(covid_ethnicity_only,"Error bars in range from 10th to 90th percentile")

# Composite plot with 5 layers stacked (All / Age / Gender / Race / Ethnicity)
composite_plot <- (g0 / g1 / g2 / g3 / g4 + plot_layout(heights = row_heights)) &
  theme(plot.margin = margin(0, 0, 0, 0))

print(composite_plot)

4 - COVID survey - re-reconstruct

Q4 ….Make Plot from Q3, but use different end point quarantiles.
When the error bars represent the 25th and 75th percentiles instead of the 10th and 90th, the intervals become narrower, reflecting a tighter range around the median of the data. This change reduces the apparent variability and uncertainty in responses. Compared to the previous plot, the shorter error bars may make the group differences appear more precise but potentially understate the true variability. Therefore, while the overall trends remain similar, conclusions about the degree of uncertainty should be adjusted to recognize that the interquartile range excludes more extreme values.

# A tibble: 132 × 6
   response        mean   low  high explanatory explanatory_value
   <chr>          <dbl> <dbl> <dbl> <chr>       <fct>            
 1 resp_concern_…  3.28     2     4 All         ""               
 2 resp_confiden…  1.43     1     2 All         ""               
 3 resp_feel_saf…  1.36     1     1 All         ""               
 4 resp_safety     2.03     1     3 All         ""               
 5 resp_trust_in…  1.40     1     2 All         ""               
 6 resp_will_rec…  1.21     1     1 All         ""               
 7 resp_concern_…  3.35     2     4 exp_age_bin "0"              
 8 resp_confiden…  1.65     1     2 exp_age_bin "0"              
 9 resp_feel_saf…  1.71     1     2 exp_age_bin "0"              
10 resp_safety     1.41     1     2 exp_age_bin "0"              
# ℹ 122 more rows

5 - COVID survey - another view

Q5a ….
COVID survey - another view. Create two bar charts of the Likert data for the six survey questions in from the plot in Exercise 2. This should be a single plot visualizing the percentages of each possible answer, with different questions on the y-axis. Use an appropriate color scale.


a. Create a diverging bar chart. Write alt text for your visualization as well.
Write alt text for your visualization as well.

Code
# - Using Likert Data for 6 survey questions, create a diverging bar chart.


# Response labels (questions)
response_labels <- c(
  resp_safety = "Vaccine is safe",
  resp_feel_safe_at_work = "Feel safer at work",
  resp_concern_safety = "Concern re : vaccine side effects",
  resp_confidence_science = "Confidence in scientific vetting",
  resp_trust_info = "Trust in vaccine info",
  resp_will_recommend = "Will recommend vaccine"
)

# Likert response mapping
likert_scores <- c(
  "1" = "Strongly Agree",
  "2" = "Somewhat Agree",
  "3" = "Neither Agree Nor Disagree",
  "4" = "Somewhat Disagree",
  "5" = "Strongly Disagree"
)

# Your custom viridis palette (turquoise to yellow-green)
my_turquoise_to_yellow <- viridis(
  12,
  begin = 0.25,
  end = 0.85,
  option = "viridis"
)[c(1, 3, 5, 7, 9, 11)]


# 1. Clean and assign numeric scores for mean computation
likert_means <- covid_survey_longer %>%
  filter(
    response %in% names(response_labels),
    !is.na(response_value),
    response_value != ""
  ) %>%
  distinct(response_id, response, .keep_all = TRUE) %>%
  mutate(
    response_label = factor(response_labels[response], levels = response_labels),
    response_numeric = as.numeric(response_value)
  ) %>%
  group_by(response_label) %>%
  summarize(
    mean_score = mean(response_numeric, na.rm = TRUE),
    sd_score = sd(response_numeric, na.rm = TRUE),
    n = n()
  ) %>%
  ungroup()

# 2. Calculate overall mean for centering
overall_mean <- mean(likert_means$mean_score, na.rm = TRUE)

# 3. Assign color scale (optional: color by mean score direction)
likert_means <- likert_means %>%
  mutate(
    centered_score = mean_score - overall_mean,
    direction = ifelse(centered_score < 0, "positive", "negative")
  )

# Add magma color mapping for Likert scores
likert_colors <- viridis::viridis(5, option = "magma")

# Prepare Likert labels with color
likert_xlabels <- data.frame(
  likert_score = 1:5,
  centered_score = 1:5 - overall_mean,
  color = likert_colors,
  label = as.character(1:5)
)
# Pick a magma tone (e.g., middle of the scale)
magma_color <- viridis::viridis(1, option = "magma")

g_centered <- ggplot(likert_means, aes(x = centered_score, y = fct_rev(response_label), fill = direction)) +
  geom_col(width = 0.6) +
  annotate(
    "segment",
    x = 0, xend = 0,
    y = 0.5, yend = 6.5,  # Adjust y range to control vertical span
    linetype = "dashed",
    color = "black",
    linewidth = 0.8
  ) +
  scale_x_continuous(
    name = "Deviation from Overall Mean Likert Score",
    limits = c(-1.5, 1.5),
    breaks = seq(-1.5, 1.5, by = 0.5),
    sec.axis = dup_axis(name = NULL, labels = NULL)
  ) +
  scale_y_discrete(
    expand = expansion(mult = c(0.05, 0.32))
  ) +
  scale_fill_manual(
    values = c(positive = my_turquoise_to_yellow[2], negative = my_turquoise_to_yellow[5]),
    name = "Likert Score Relation",
    labels = c(positive =  "Below Mean", negative = "Above Mean")
  ) +
  labs(
    title = "Mean-Centered Likert Scores by Question\n",
    y = NULL
  ) +
  annotate(
    "segment",
    x = 0.45, xend = 0,
    y = 1.3, yend = 1.6,
    color = "black",
    arrow = arrow(length = unit(0.2, "cm")),
    linewidth = 0.6
  ) +
  annotate(
    "label",
    x = 0.5, y = 1.1,
    label = sprintf("Mean = %.2f", overall_mean),
    hjust = 0,
    vjust = 0,
    size = 4.5,
    fill = "white",
    color = "black",
    label.size = 0.4,
    label.r = unit(0.15, "lines"),
    label.padding = unit(0.3, "lines")
  ) +
  # Likert numbers colored with magma palette
  geom_text(
    data = likert_xlabels,
    aes(x = centered_score, y = 6.8, label = likert_score),
    inherit.aes = FALSE,
    size = 7,
    color = likert_colors
  ) +
  # Likert Scores label colored in magma (middle color)
  annotate(
    "text",
    x = 0, y = 7.4,
    label = "Likert Scores",
    size = 7,
    fontface = "italic",
    color = likert_colors[3]
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 16, face = "bold", hjust = 0.5),
    axis.text.y = element_text(size = 15, lineheight = 2),
    axis.text.x = element_text(size = 14),
    axis.title.x.top = element_text(size = 10, color = "gray30", face = "italic", hjust = 1),
    legend.position = "bottom",
    legend.title = element_text(size = 12, face = "bold"),
    legend.text = element_text(size = 11),
    plot.margin = margin(t = 10, r = 10, b = 10, l = 10)
  )

plot(g_centered)

Q5a - Alt text
A horizontal diverging bar chart presents mean-centered Likert scores for six COVID vaccine survey questions: “Vaccine is safe,” “Feel safer at work,” “Concern re : vaccine side effects,” “Confidence in scientific vetting,” “Trust in vaccine info,” and “Will recommend vaccine.” Each bar extends from a vertical midpoint at the mean response (1.79) to show how average responses deviate above or below the overall mean, with greater lengths indicating stronger deviation.
Turquoise segments indicate questions with more agreement (e.g., stronger confidence or willingness to recommend), while yellow-green segments reflect comparatively more skepticism. The chart includes an annotated mean marker, and Likert scale values from “1 - Strongly Agree” to “5 - Strongly Disagree” appear along the top axis in magma tones to reinforce the interpretive range.
All questions except those concerning vaccine safety and side effects tend towards ‘1 - Strongly Agree.’ The scores for ‘Vaccine is safe’ and ‘concerns about vaccine side effects’, while above the mean, did not score into Likert-4 or Likert-5, indicating uncertainty or abmivalence but not concern.

Q5b ….
COVID survey - another view. Create two bar charts of the Likert data for the six survey questions in from the plot in Exercise 2. This should be a single plot visualizing the percentages of each possible answer, with different questions on the y-axis. Use an appropriate color scale.
b. Create a 100% bar chart
Write alt text for your visualization as well.

Code
# Full Likert labels - for the legend
likert_labels_full <- c(
  "1" = "1 - Strongly Agree",
  "2" = "2 - Somewhat Agree",
  "3" = "3 - Neither Agree Nor Disagree",
  "4" = "4 - Somewhat Disagree",
  "5" = "5 - Strongly Disagree"
)

# Filter and prepare proportions
likert_props <- covid_survey_longer %>%
  filter(response %in% names(response_labels), !is.na(response_value), response_value != "") %>%
  mutate(
    response_label = factor(response_labels[response], levels = response_labels),
    likert_level = factor(likert_labels_full[response_value], levels = likert_labels_full)
  ) %>%
  group_by(response_label, likert_level) %>%
  summarize(n = n(), .groups = "drop") %>%
  group_by(response_label) %>%
  mutate(percentage = n / sum(n) * 100)

# Build 100% stacked bar chart with full Likert labels
g5b <- ggplot(likert_props, aes(x = percentage, y = fct_rev(response_label), fill = likert_level)) +
  geom_col(width = 0.7) +
  scale_fill_viridis_d(
    option = "viridis",
    begin = 0.2,
    end = 0.85,
    direction = -1,
    name = "Response"
  ) +
  labs(
    title = "Distribution of Likert Responses by Question",
    x = "Percentage",
    y = NULL
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(size = 18, face = "bold", hjust = 0.5),
    legend.position = "right",
    legend.title = element_text(size = 12, face = "bold"),
    legend.text = element_text(size = 11),
    axis.text.y = element_text(size = 14),
    axis.text.x = element_text(size = 14)
  )

plot(g5b)

Q5b - Alt text
A horizontal stacked bar chart displays the distribution of Likert-scale responses to six survey questions on COVID vaccine attitudes, including: “Vaccine is safe,” “Feel safer at work,” “Concern re: vaccine side effects,” “Confidence in scientific vetting,” “Trust in vaccine info,” and “Will recommend vaccine.”
Each bar represents a question, divided into colored segments for each response level from “1 - Strongly Agree” to “5 - Strongly Disagree,” using a perceptually uniform Viridis palette. Most responses cluster on the agreement end of the scale, especially for trust and confidence statements, while the item “Concern about vaccine safety” shows a more neutral distribution centered around “Somewhat Agree” and “Neither Agree Nor Disagree.”
The visual design emphasizes sentiment distribution and is optimized for accessibility, including colorblind-friendly contrast and consistent labeling.